# 1)
library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.1 ──
## ✓ ggplot2 3.3.5 ✓ purrr 0.3.4
## ✓ tibble 3.1.5 ✓ dplyr 1.0.7
## ✓ tidyr 1.1.4 ✓ stringr 1.4.0
## ✓ readr 2.0.2 ✓ forcats 0.5.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
covid <- read_csv("https://covid19.who.int/WHO-COVID-19-global-data.csv")
## Rows: 190074 Columns: 8
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (3): Country_code, Country, WHO_region
## dbl (4): New_cases, Cumulative_cases, New_deaths, Cumulative_deaths
## date (1): Date_reported
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
str(covid)
## spec_tbl_df [190,074 × 8] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
## $ Date_reported : Date[1:190074], format: "2020-01-03" "2020-01-04" ...
## $ Country_code : chr [1:190074] "AF" "AF" "AF" "AF" ...
## $ Country : chr [1:190074] "Afghanistan" "Afghanistan" "Afghanistan" "Afghanistan" ...
## $ WHO_region : chr [1:190074] "EMRO" "EMRO" "EMRO" "EMRO" ...
## $ New_cases : num [1:190074] 0 0 0 0 0 0 0 0 0 0 ...
## $ Cumulative_cases : num [1:190074] 0 0 0 0 0 0 0 0 0 0 ...
## $ New_deaths : num [1:190074] 0 0 0 0 0 0 0 0 0 0 ...
## $ Cumulative_deaths: num [1:190074] 0 0 0 0 0 0 0 0 0 0 ...
## - attr(*, "spec")=
## .. cols(
## .. Date_reported = col_date(format = ""),
## .. Country_code = col_character(),
## .. Country = col_character(),
## .. WHO_region = col_character(),
## .. New_cases = col_double(),
## .. Cumulative_cases = col_double(),
## .. New_deaths = col_double(),
## .. Cumulative_deaths = col_double()
## .. )
## - attr(*, "problems")=<externalptr>
summary(covid)
## Date_reported Country_code Country WHO_region
## Min. :2020-01-03 Length:190074 Length:190074 Length:190074
## 1st Qu.:2020-07-21 Class :character Class :character Class :character
## Median :2021-02-06 Mode :character Mode :character Mode :character
## Mean :2021-02-06
## 3rd Qu.:2021-08-26
## Max. :2022-03-14
## New_cases Cumulative_cases New_deaths Cumulative_deaths
## Min. : -32952 Min. : 0 Min. : -60.00 Min. : 0
## 1st Qu.: 0 1st Qu.: 146 1st Qu.: 0.00 1st Qu.: 1
## Median : 22 Median : 9454 Median : 0.00 Median : 128
## Mean : 2403 Mean : 538413 Mean : 31.79 Mean : 10880
## 3rd Qu.: 508 3rd Qu.: 142264 3rd Qu.: 6.00 3rd Qu.: 2445
## Max. :1294746 Max. :78777620 Max. :8786.00 Max. :960144
count(covid)
## # A tibble: 1 × 1
## n
## <int>
## 1 190074
uscovid = filter(covid, Country == 'United States of America') %>% print
## # A tibble: 802 × 8
## Date_reported Country_code Country WHO_region New_cases Cumulative_cases
## <date> <chr> <chr> <chr> <dbl> <dbl>
## 1 2020-01-03 US United Stat… AMRO 0 0
## 2 2020-01-04 US United Stat… AMRO 0 0
## 3 2020-01-05 US United Stat… AMRO 0 0
## 4 2020-01-06 US United Stat… AMRO 0 0
## 5 2020-01-07 US United Stat… AMRO 0 0
## 6 2020-01-08 US United Stat… AMRO 0 0
## 7 2020-01-09 US United Stat… AMRO 0 0
## 8 2020-01-10 US United Stat… AMRO 0 0
## 9 2020-01-11 US United Stat… AMRO 0 0
## 10 2020-01-12 US United Stat… AMRO 0 0
## # … with 792 more rows, and 2 more variables: New_deaths <dbl>,
## # Cumulative_deaths <dbl>
indiacovid <- filter(covid, Country == "India") %>% print
## # A tibble: 802 × 8
## Date_reported Country_code Country WHO_region New_cases Cumulative_cases
## <date> <chr> <chr> <chr> <dbl> <dbl>
## 1 2020-01-03 IN India SEARO 0 0
## 2 2020-01-04 IN India SEARO 0 0
## 3 2020-01-05 IN India SEARO 0 0
## 4 2020-01-06 IN India SEARO 0 0
## 5 2020-01-07 IN India SEARO 0 0
## 6 2020-01-08 IN India SEARO 0 0
## 7 2020-01-09 IN India SEARO 0 0
## 8 2020-01-10 IN India SEARO 0 0
## 9 2020-01-11 IN India SEARO 0 0
## 10 2020-01-12 IN India SEARO 0 0
## # … with 792 more rows, and 2 more variables: New_deaths <dbl>,
## # Cumulative_deaths <dbl>
count(uscovid)
## # A tibble: 1 × 1
## n
## <int>
## 1 802
count(indiacovid)
## # A tibble: 1 × 1
## n
## <int>
## 1 802
#2)
JanAprilUS = uscovid[c(365:484),]
summary(JanAprilUS$Cumulative_cases)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 19818218 25827266 28427792 27672070 30126574 31944206
qqnorm(JanAprilUS$Cumulative_cases)
MayAugUS = uscovid[c(485:607),]
summary(MayAugUS$Cumulative_cases)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 32000132 32931674 33327329 34042957 34738822 38808503
qqnorm(MayAugUS$Cumulative_cases)
SepDecUS = uscovid[c(608:729),]
summary(SepDecUS$Cumulative_cases)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 38941538 43131636 45659813 45696643 48151302 53527519
qqnorm(SepDecUS$Cumulative_cases)
# collect subsets for India
JanAprilInd <- indiacovid[c(365:484),]
MayAugInd <- indiacovid[c(485:607),]
SepDecInd <- indiacovid[c(608:729),]
JanAprilVec <- rep(c(1,2,3,4),times = c(31,28,31,30))
JanAprilUS$month <- JanAprilVec
JanAprilInd$month <- JanAprilVec
MayAugVec <- rep(c(5,6,7,8), times=c(31,30,31,31))
MayAugUS$month <- MayAugVec
MayAugInd$month <- MayAugVec
SepDecVec <- rep(c(9,10,11,12), times=c(30,31,30,31))
SepDecUS$month <- SepDecVec
SepDecInd$month <- SepDecVec
mergedUS <- rbind(JanAprilUS,MayAugUS)
mergedUS <- rbind(mergedUS,SepDecUS)
mergedInd <- rbind(JanAprilInd,MayAugInd)
mergedInd <- rbind(mergedInd,SepDecInd)
group_vec <- rep(c(1,2,3), times=c(120, 123, 122))
mergedUS$group <- group_vec
mergedInd$group <- group_vec
MORrateUS <- mergedUS$Cumulative_deaths/mergedUS$Cumulative_cases
MORrateInd <- mergedInd$Cumulative_deaths/mergedInd$Cumulative_cases
mergedUS$mortrate <- MORrateUS
mergedInd$mortrate <- MORrateInd
# take random sample
sampleUS <- mergedUS %>% group_by(group) %>% slice_sample(n=50)
sampleInd <- mergedInd %>% group_by(group) %>% slice_sample(n=50)
# Cumulative Case plots
ggplot(data=mergedUS, mapping=aes(x=Date_reported, y=Cumulative_cases)) + geom_line()
ggplot(data=mergedInd, mapping=aes(x=Date_reported, y=Cumulative_cases)) + geom_line()
# Cumulative Death plots
ggplot(data=mergedUS, mapping=aes(x=Date_reported, y=Cumulative_deaths)) + geom_line()
ggplot(data=mergedInd, mapping=aes(x=Date_reported, y=Cumulative_deaths)) + geom_line()
# Mortality plots
ggplot(data=mergedUS, mapping=aes(x=Date_reported, y=Cumulative_deaths/Cumulative_cases)) + geom_line()
ggplot(data=mergedInd, mapping=aes(x=Date_reported, y=Cumulative_deaths/Cumulative_cases)) + geom_line()
?filter
## Help on topic 'filter' was found in the following packages:
##
## Package Library
## stats /Library/Frameworks/R.framework/Versions/4.1/Resources/library
## dplyr /Library/Frameworks/R.framework/Versions/4.1/Resources/library
##
##
## Using the first match ...
ggplot(data = JanAprilUS) + geom_line(mapping = aes(x= Date_reported, y= Cumulative_cases)) #geom_bar enables bar plotting. Mapping is the dimension set
ggplot(data = MayAugUS) + geom_line(mapping = aes(x= Date_reported, y= Cumulative_cases)) #geom_bar enables bar plotting. Mapping is the dimension set
ggplot(data = SepDecUS) + geom_line(mapping = aes(x= Date_reported, y= Cumulative_cases)) #geom_bar enables bar plotting. Mapping is the dimension set
ggplot(data = JanAprilInd) + geom_line(mapping = aes(x= Date_reported, y= Cumulative_cases)) #geom_bar enables bar plotting. Mapping is the dimension set
ggplot(data = MayAugInd) + geom_line(mapping = aes(x= Date_reported, y= Cumulative_cases)) #geom_bar enables bar plotting. Mapping is the dimension set
ggplot(data = SepDecInd) + geom_line(mapping = aes(x= Date_reported, y= Cumulative_cases)) #geom_bar enables bar plotting. Mapping is the dimension set
#3)
?aes
#mergedUS <- rbind(JanAprilUS,MayAugUS)
#mergedUS <- rbind(mergedUS,SepDecUS)
#MORrate = mergedUS$Cumulative_deaths/mergedUS$Cumulative_cases
summaryUS = fivenum(MORrateUS)
summaryInd = fivenum(MORrateInd)
# Jan-April (US)
summary(JanAprilUS)
## Date_reported Country_code Country WHO_region
## Min. :2021-01-01 Length:120 Length:120 Length:120
## 1st Qu.:2021-01-30 Class :character Class :character Class :character
## Median :2021-03-01 Mode :character Mode :character Mode :character
## Mean :2021-03-01
## 3rd Qu.:2021-03-31
## Max. :2021-04-30
## New_cases Cumulative_cases New_deaths Cumulative_deaths
## Min. : 30496 Min. :19818218 Min. : 368.0 Min. :355582
## 1st Qu.: 58151 1st Qu.:25827266 1st Qu.: 816.5 1st Qu.:450936
## Median : 70726 Median :28427792 Median :1546.0 Median :521880
## Mean :103029 Mean :27672070 Mean :1849.2 Mean :498533
## 3rd Qu.:138802 3rd Qu.:30126574 3rd Qu.:2780.2 3rd Qu.:553170
## Max. :293313 Max. :31944206 Max. :5074.0 Max. :573727
## month
## Min. :1.00
## 1st Qu.:1.00
## Median :3.00
## Mean :2.50
## 3rd Qu.:3.25
## Max. :4.00
# May-Aug (US)
summary(MayAugUS)
## Date_reported Country_code Country WHO_region
## Min. :2021-05-01 Length:123 Length:123 Length:123
## 1st Qu.:2021-05-31 Class :character Class :character Class :character
## Median :2021-07-01 Mode :character Mode :character Mode :character
## Mean :2021-07-01
## 3rd Qu.:2021-07-31
## Max. :2021-08-31
## New_cases Cumulative_cases New_deaths Cumulative_deaths
## Min. : 8335 Min. :32000132 Min. : 106.0 Min. :574485
## 1st Qu.: 15592 1st Qu.:32931674 1st Qu.: 302.0 1st Qu.:591278
## Median : 32135 Median :33327329 Median : 417.0 Median :600885
## Mean : 55807 Mean :34042957 Mean : 548.1 Mean :601839
## 3rd Qu.: 91196 3rd Qu.:34738822 3rd Qu.: 676.5 3rd Qu.:609946
## Max. :186309 Max. :38808503 Max. :1833.0 Max. :641147
## month
## Min. :5.000
## 1st Qu.:5.500
## Median :7.000
## Mean :6.504
## 3rd Qu.:7.500
## Max. :8.000
# Sept-Dec (US)
summary(SepDecUS)
## Date_reported Country_code Country WHO_region
## Min. :2021-09-01 Length:122 Length:122 Length:122
## 1st Qu.:2021-10-01 Class :character Class :character Class :character
## Median :2021-10-31 Mode :character Mode :character Mode :character
## Mean :2021-10-31
## 3rd Qu.:2021-11-30
## Max. :2021-12-31
## New_cases Cumulative_cases New_deaths Cumulative_deaths
## Min. : 22672 Min. :38941538 Min. : 172 Min. :642350
## 1st Qu.: 81584 1st Qu.:43131636 1st Qu.:1067 1st Qu.:698330
## Median :108496 Median :45659813 Median :1482 Median :744288
## Mean :120648 Mean :45696643 Mean :1451 Mean :737233
## 3rd Qu.:138510 3rd Qu.:48151302 3rd Qu.:1948 3rd Qu.:776525
## Max. :473047 Max. :53527519 Max. :3871 Max. :818171
## month
## Min. : 9.00
## 1st Qu.:10.00
## Median :10.50
## Mean :10.51
## 3rd Qu.:11.75
## Max. :12.00
# Jan-April (India)
summary(JanAprilInd)
## Date_reported Country_code Country WHO_region
## Min. :2021-01-01 Length:120 Length:120 Length:120
## 1st Qu.:2021-01-30 Class :character Class :character Class :character
## Median :2021-03-01 Mode :character Mode :character Mode :character
## Mean :2021-03-01
## 3rd Qu.:2021-03-31
## Max. :2021-04-30
## New_cases Cumulative_cases New_deaths Cumulative_deaths
## Min. : 8635 Min. :10286709 Min. : 77.0 Min. :148994
## 1st Qu.: 13776 1st Qu.:10742920 1st Qu.: 112.2 1st Qu.:154242
## Median : 18200 Median :11118384 Median : 165.0 Median :157202
## Mean : 70802 Mean :11909492 Mean : 496.6 Mean :161203
## 3rd Qu.: 69098 3rd Qu.:12167418 3rd Qu.: 377.0 3rd Qu.:162583
## Max. :386452 Max. :18762976 Max. :3645.0 Max. :208330
## month
## Min. :1.00
## 1st Qu.:1.00
## Median :3.00
## Mean :2.50
## 3rd Qu.:3.25
## Max. :4.00
# May-Aug (India)
summary(MayAugInd)
## Date_reported Country_code Country WHO_region
## Min. :2021-05-01 Length:123 Length:123 Length:123
## 1st Qu.:2021-05-31 Class :character Class :character Class :character
## Median :2021-07-01 Mode :character Mode :character Mode :character
## Mean :2021-07-01
## 3rd Qu.:2021-07-31
## Max. :2021-08-31
## New_cases Cumulative_cases New_deaths Cumulative_deaths
## Min. : 25072 Min. :19164969 Min. : 350 Min. :211853
## 1st Qu.: 39229 1st Qu.:28111289 1st Qu.: 534 1st Qu.:330498
## Median : 46148 Median :30411634 Median :1005 Median :399459
## Mean :113869 Mean :29242628 Mean :1872 Mean :371596
## 3rd Qu.:143444 3rd Qu.:31634908 3rd Qu.:3486 3rd Qu.:424080
## Max. :414188 Max. :32768880 Max. :6148 Max. :438560
## month
## Min. :5.000
## 1st Qu.:5.500
## Median :7.000
## Mean :6.504
## 3rd Qu.:7.500
## Max. :8.000
# Sept-Dec (India)
summary(SepDecInd)
## Date_reported Country_code Country WHO_region
## Min. :2021-09-01 Length:122 Length:122 Length:122
## 1st Qu.:2021-10-01 Class :character Class :character Class :character
## Median :2021-10-31 Mode :character Mode :character Mode :character
## Mean :2021-10-31
## 3rd Qu.:2021-11-30
## Max. :2021-12-31
## New_cases Cumulative_cases New_deaths Cumulative_deaths
## Min. : 5326 Min. :32810845 Min. : 125.0 Min. :439020
## 1st Qu.: 8910 1st Qu.:33772796 1st Qu.: 244.5 1st Qu.:448398
## Median :13074 Median :34279557 Median : 304.0 Median :458312
## Mean :16967 Mean :34135756 Mean : 348.5 Mean :459079
## 3rd Qu.:22739 3rd Qu.:34594538 3rd Qu.: 392.8 3rd Qu.:469180
## Max. :47092 Max. :34838804 Max. :2796.0 Max. :481080
## month
## Min. : 9.00
## 1st Qu.:10.00
## Median :10.50
## Mean :10.51
## 3rd Qu.:11.75
## Max. :12.00
boxplot(Cumulative_cases/Cumulative_deaths~group,data=mergedUS)
boxplot(Cumulative_cases/Cumulative_deaths~group,data=mergedInd)
#mergedUS$mortrate <- MORrate
# Cumulative Cases (US and India)
summaryU <- mergedUS %>% group_by(group) %>% summarise(cum_cases=mean(Cumulative_cases))
mergedplot <- ggplot(summaryU, aes(x=group, y=cum_cases, fill=Cumulative_cases)) + geom_col(fill="#0099f9") + scale_y_continuous(labels = scales::unit_format(unit = "M", scale = 1e-6))
mergedplot
summaryI <- mergedInd %>% group_by(group) %>% summarise(cum_cases=mean(Cumulative_cases))
mergedplot1 <- ggplot(summaryI, aes(x=group, y=cum_cases, fill=Cumulative_cases)) + geom_col(fill="#0099f9") + scale_y_continuous(labels = scales::unit_format(unit = "M", scale = 1e-6))
mergedplot1
# Cumulative Deaths
summaryU2 <- mergedUS %>% group_by(group) %>% summarise(cum_deaths=mean(Cumulative_deaths))
mergedplot2 <- ggplot(summaryU2, aes(x=group, y=cum_deaths, fill=Cumulative_deaths)) + geom_col(fill="#0099f9") + scale_y_continuous(labels = scales::unit_format(unit = "k", scale = 1e-3))
mergedplot2
summaryI <- mergedInd %>% group_by(group) %>% summarise(cum_deaths=mean(Cumulative_deaths))
mergedplot3 <- ggplot(summaryI, aes(x=group, y=cum_deaths, fill=Cumulative_deaths)) + geom_col(fill="#0099f9") + scale_y_continuous(labels = scales::unit_format(unit = "k", scale = 1e-3))
mergedplot3
# Case Mortality plots
summaryU4 <- mergedUS %>% group_by(group) %>% summarise(case_mort=mean(Cumulative_deaths/Cumulative_cases))
mergedplot4 <- ggplot(summaryU4, aes(x=group, y=case_mort, fill=case_mort)) + geom_col(fill="#0099f9")
mergedplot4
summaryU5 <- mergedInd %>% group_by(group) %>% summarise(case_mort=mean(Cumulative_deaths/Cumulative_cases))
mergedplot5 <- ggplot(summaryU5, aes(x=group, y=case_mort, fill=case_mort)) + geom_col(fill="#0099f9")
mergedplot5
#4)
#install.packages(c("ggplot2", "ggpubr", "tidyverse", "broom", "AICcmodavg"))
library(ggplot2)
#library(ggpubr)
library(tidyverse)
library(broom)
## Warning: package 'broom' was built under R version 4.1.2
library(AICcmodavg)
rootmergedUS <- sampleUS
rootmergedUS$Cumulative_cases <- (sampleUS$Cumulative_cases)^(1/2)
rootmergedUS$Cumulative_deaths <- (sampleUS$Cumulative_deaths)^(1/2)
rootmergedUS$mortrate <- rootmergedUS$Cumulative_cases/rootmergedUS$Cumulative_deaths
rootmergedInd <- sampleInd
rootmergedInd$Cumulative_cases <- (sampleInd$Cumulative_cases)^(1/2)
rootmergedInd$Cumulative_deaths <- (sampleInd$Cumulative_deaths)^(1/2)
rootmergedInd$mortrate <- rootmergedInd$Cumulative_cases/rootmergedInd$Cumulative_deaths
summary(uscovid$Cumulative_cases)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0 3866850 26664440 24968869 37951204 78777620
qqnorm(uscovid$Cumulative_cases)
summary(rootmergedUS$Cumulative_cases)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 4452 5509 5762 5937 6612 7316
qqnorm(rootmergedUS$Cumulative_cases)
summary(mergedUS$mortrate)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.01529 0.01627 0.01759 0.01727 0.01801 0.01848
qqnorm(mergedUS$mortrate)
summary(rootmergedUS$mortrate)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 7.355 7.453 7.521 7.618 7.842 8.088
qqnorm(rootmergedUS$mortrate)
summary(indiacovid$Cumulative_cases)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0 1164622 10820334 16482188 32546989 42993494
qqnorm(indiacovid$Cumulative_cases)
summary(rootmergedInd$Cumulative_cases)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 3207 3444 5496 4890 5822 5902
qqnorm(rootmergedInd$Cumulative_cases)
summary(mergedInd$mortrate)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.01086 0.01317 0.01339 0.01324 0.01374 0.01448
qqnorm(mergedInd$mortrate)
# model selection
mergedcountry <- rbind(rootmergedUS,rootmergedInd)
mergedcountry <- mergedcountry %>% group_by(group) %>% slice_sample(n=50)
# One-way ANOVA
usaov <- aov(Cumulative_cases ~ group, data = rootmergedUS)
summary(usaov)
## Df Sum Sq Mean Sq F value Pr(>F)
## group 1 62854946 62854946 750.7 <2e-16 ***
## Residuals 148 12391414 83726
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Two-way ANOVA
two.way <- aov(Cumulative_cases ~ group + Country, data = mergedcountry)
summary(two.way)
## Df Sum Sq Mean Sq F value Pr(>F)
## group 1 103325632 103325632 641.6 <2e-16 ***
## Country 1 44279296 44279296 274.9 <2e-16 ***
## Residuals 147 23674962 161054
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#5)
# Determine best model
AIC(usaov, two.way)
## df AIC
## usaov 3 2129.963
## two.way 4 2229.076
#6)
# Evaluate the normality of a Cumulative Cases using a Q-Q plot
plot(usaov)
plot(two.way)
# Shapiro Test
onewayres<-resid(usaov)
shapiro.test(onewayres)
##
## Shapiro-Wilk normality test
##
## data: onewayres
## W = 0.97519, p-value = 0.008124
twowayres <- resid(two.way)
shapiro.test(twowayres)
##
## Shapiro-Wilk normality test
##
## data: twowayres
## W = 0.95414, p-value = 7.341e-05
In this project, We are going to analyze multiple factors that we think may have impacted the overall spread of corona virus in 2021 in order to identify and analyze the prime variables resposible for the increase in cumulative cases, deaths, and overall mortality rate for the year 2021. We will first run five number summaries of the variables we analyze in order to find patterns or observations about the data. Then, we will use ANOVA modeling to investigate the randomly sampled (n=50 for each variable) variables time (as a matter of 3 groups for the year 2021. The groups are denoted as Group 1: January-April, Group 2: May-August, Group 3: September-December) and country (2 countries: United States of America and India) We will use covid data provided by WHO from a dataset that has regular daily updates. We will experiment with multiple ANOVA models but conclude with valid ANOVA models that yiueld fruitful results about the variables analyzed. This analysis will cover wether if any of the variables (time and country) have an effect of overall covid spread that is denoted in multiple ways (cumulative deaths, cumulative cases, death rate). The dataset (from WHO) used to derive these models hold covid data about the coronavirus throughout the world since its outbreak, and we will extract the entries of covid cases spefically in the year 2021 and in the countries of US and India. We will split these US and India covid entries into 6 datasets, each dataset having an interval of 4 months. These 4 month intervals are January-April, May-August, and September-December. After filtering our data into three distinct datasets, we will will examine the impact of time and country on cumulative cases in the US during the year 2021 via the means of the “Cumulative_cases”, “Cumulative_death” and a “mortrate” (Death Rate) variable of each country’s dataset. We will utilize any transformations necessary on the data in order to enable validity of all assumptions in our ANOVA models, some verified by normality tests and other analyses of residual plots (QQplots, etc). After running multiple ANOVA models and verifying their assumptions we will determine which models (created from which variables) can most effectively explain our covid variables in the year 2021. If we find fruitful results about the time of year or country in terms of their impact on the covid variables from the WHO dataset in the year 2021, we can draw conclusions about these variables that can perhaps help the general public achieve higher protection from covid and its effects. We will draw these concusions that answer the following questions of interest.
The results of our project can help us determine the conditions where COVID spreads much faster based on data found from the US and India in the year 2021, and allow the general public to make improved adjustments in their quarantine plans and overall safety.
SARS-CoV-2 is a virus that was exposed to humanity around the ending months of 2019 and proceeded to cause an entire pandemic through the year 2020, effectively disrupting society and forcing people to quarantine for their safety even today. Ever since then, WHO has made a dataset that holds information/statisttics about the cases and deaths caused by COVID from 2019 to today. This dataset is daily updated by WHO and reflects the most current status of COVID from its outbreak to the present, as COVID is still a prominent problem today. We will analyze this datset in order to study the background around the covid pandemic further and see if there are any possible patterns emitted by this pandemic.
We will explore the WHO COVID-19 data in this project. You can take a look at the weekly WHO COVID-19 update for reference. ## CODE BLOCK
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.01529 0.01627 0.01759 0.01727 0.01801 0.01848
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.01086 0.01317 0.01339 0.01324 0.01374 0.01448
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 355582 450936 521880 498533 553170 573727
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 19818218 25827266 28427792 27672070 30126574 31944206
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.01725 0.01762 0.01805 0.01798 0.01836 0.01848
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 574485 591278 600885 601839 609946 641147
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 32000132 32931674 33327329 34042957 34738822 38808503
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.01652 0.01756 0.01792 0.01770 0.01799 0.01803
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 642350 698330 744288 737233 776525 818171
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 38941538 43131636 45659813 45696643 48151302 53527519
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.01529 0.01613 0.01620 0.01615 0.01627 0.01650
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 148994 154242 157202 161203 162583 208330
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 10286709 10742920 11118384 11909492 12167418 18762976
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.01110 0.01336 0.01414 0.01369 0.01436 0.01448
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 211853 330498 399459 371596 424080 438560
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 19164969 28111289 30411634 29242628 31634908 32768880
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.01086 0.01176 0.01314 0.01261 0.01340 0.01341
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 439020 448398 458312 459079 469180 481080
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 32810845 33772796 34279557 34135756 34594538 34838804
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.01327 0.01329 0.01338 0.01345 0.01356 0.01381
(2021 Death rate) Min. 1st Qu. Median Mean 3rd Qu. Max. 0.01529 0.01627 0.01759 0.01727 0.01801 0.01848
(JanApril Cumulative covid deaths)
Min. 1st Qu. Median Mean 3rd Qu. Max. 355582 450936 521880 498533 553170 573727
(JanApril Cumulative covid cases) Min. 1st Qu. Median Mean 3rd Qu. Max. 19818218 25827266 28427792 27672070 30126574 31944206
(JanApril Death Rate) Min. 1st Qu. Median Mean 3rd Qu. Max. 0.01725 0.01762 0.01805 0.01798 0.01836 0.01848
(MayAug Cumulative covid deaths)
Min. 1st Qu. Median Mean 3rd Qu. Max. 574485 591278 600885 601839 609946 641147
(MayAug Cumulative covid cases) Min. 1st Qu. Median Mean 3rd Qu. Max. 32000132 32931674 33327329 34042957 34738822 38808503
(MayAug Death Rate)
Min. 1st Qu. Median Mean 3rd Qu. Max. 0.01652 0.01756 0.01792 0.01770 0.01799 0.01803
(SepDec Cumulative covid deaths)
Min. 1st Qu. Median Mean 3rd Qu. Max. 642350 698330 744288 737233 776525 818171
(SepDec Cumulative covid cases)
Min. 1st Qu. Median Mean 3rd Qu. Max.
38941538 43131636 45659813 45696643 48151302 53527519
(SepDec Death Rate)
Min. 1st Qu. Median Mean 3rd Qu. Max. 0.01529 0.01613 0.01620 0.01615 0.01627 0.01650
(2021 Death rate) Min. 1st Qu. Median Mean 3rd Qu. Max. 0.01086 0.01317 0.01339 0.01324 0.01374 0.01448
(JanApril Cumulative covid deaths)
Min. 1st Qu. Median Mean 3rd Qu. Max. 148994 154242 157202 161203 162583 208330
(JanApril Cumulative covid cases) Min. 1st Qu. Median Mean 3rd Qu. Max. 10286709 10742920 11118384 11909492 12167418 18762976
(JanApril Death Rate) Min. 1st Qu. Median Mean 3rd Qu. Max. 0.01110 0.01336 0.01414 0.01369 0.01436 0.01448
(MayAug Cumulative covid deaths)
Min. 1st Qu. Median Mean 3rd Qu. Max. 211853 330498 399459 371596 424080 438560
(MayAug Cumulative covid cases) Min. 1st Qu. Median Mean 3rd Qu. Max. 19164969 28111289 30411634 29242628 31634908 32768880
(MayAug Death Rate)
Min. 1st Qu. Median Mean 3rd Qu. Max. 0.01086 0.01176 0.01314 0.01261 0.01340 0.01341
(SepDec Cumulative covid deaths)
Min. 1st Qu. Median Mean 3rd Qu. Max. 439020 448398 458312 459079 469180 481080
(SepDec Cumulative covid cases)
Min. 1st Qu. Median Mean 3rd Qu. Max. 32810845 33772796 34279557 34135756 34594538 34838804
(SepDec Death Rate)
Min. 1st Qu. Median Mean 3rd Qu. Max. 0.01327 0.01329 0.01338 0.01345 0.01356 0.01381
When skimming at the number summaries, it seems that the overall variables for group seem to differ but not as much for country. This is a mere observation which will be verified if correct or not with models and plots specifically for covid cases
Plots
ggplot(data = JanAprilUS) + geom_line(mapping = aes(x= Date_reported, y= Cumulative_cases)) + ggtitle("January to April Cumulative Cases 2021 US") #geom_bar enables bar plotting. Mapping is the dimension set
This graph shows the increase in Cumulative covid cases in 2021 from January to April in the US. We can see that there is a sharp spike in this first graph that could be due to the time year
ggplot(data = MayAugUS) + geom_line(mapping = aes(x= Date_reported, y= Cumulative_cases))+ ggtitle("May to August Cumulative Cases 2021 US") #geom_bar enables bar plotting. Mapping is the dimension set
This graph shows the increase in Cumulative covid cases in 2021 from May to August in the US. We can see that there is a smaller spike in cases from May to July compared to earlier in the year but picks up in September. This could be an interesting pattern exhbited by the variable time of year but more evidence is necessary.
ggplot(data = SepDecUS) + geom_line(mapping = aes(x= Date_reported, y= Cumulative_cases)) + ggtitle("September to Decemeber Cumulative Cases 2021 US") #geom_bar enables bar plotting. Mapping is the dimension set
This graph shows the increase in Cumulative covid cases in 2021 from September to December in the US. We can see that there is another smaller spike in the rise of covid cases contuing on from september but that spike grows larger and larger and does not stop increasing. There could be a reason as to why the spread of covid increases at certain times and isn’t as high at other times.
ggplot(data = JanAprilInd) + geom_line(mapping = aes(x= Date_reported, y= Cumulative_cases)) + ggtitle("January to April Cumulative Cases 2021 India")#geom_bar enables bar plotting. Mapping is the dimension set
This graph shows the increase in Cumulative covid cases in 2021 from January to April in India We can see that there is a smaller spike that ends up drasticvally rising at march. This could also be due to time of year but also shows a different pattern of spread than the US numbers wise and spread wise.
ggplot(data = MayAugInd) + geom_line(mapping = aes(x= Date_reported, y= Cumulative_cases)) + ggtitle("May to August Cumulative Cases 2021 India") #geom_bar enables bar plotting. Mapping is the dimension set
This graph shows the increase in Cumulative covid cases in 2021 from May to August in India. We can see an intense surge in covid cases around this time especially from May to June, this spread picks up the lack of spread much earlier in the year as shown by the previous graph. The number of cases reaches roughly around the same as the US at this time
ggplot(data = SepDecInd) + geom_line(mapping = aes(x= Date_reported, y= Cumulative_cases)) + ggtitle("September to Decemeber Cumulative Cases 2021 India")#geom_bar enables bar plotting. Mapping is the dimension set
This graph shows the increase in Cumulative covid cases in 2021 from September to December in India. We can see the spread of covid in india steadily rises but not as a spike but rather a loose logarithm curve pattern. This trend was not seen in the US graph at around this time which can open the door to analyze some patterns.
boxplot(Cumulative_deaths/Cumulative_cases~group,data=mergedUS)
This boxplot shows the distributions of the data of death for each group of time in the US of 2021 It seems that there are more outliers in the last group, which may affect the assumptions for ANOVA we must verify for.
boxplot(Cumulative_deaths/Cumulative_cases~group,data=mergedInd)
This boxplot shows the distributions of the data of death for each group of time in the India of 2021 It seems that there are more outliers in the first group, which may affect the assumptions for ANOVA we must verify for.
# Cumulative Cases (US and India)
summaryU <- mergedUS %>% group_by(group) %>% summarise(cum_cases=mean(Cumulative_cases))
mergedplot <- ggplot(summaryU, aes(x=group, y=cum_cases, fill=Cumulative_cases)) + geom_col(fill="#0099f9") + ggtitle("Cumulative Cases 2021 US") + scale_y_continuous(labels = scales::unit_format(unit = "M", scale = 1e-6))
mergedplot
summaryI <- mergedInd %>% group_by(group) %>% summarise(cum_cases=mean(Cumulative_cases))
mergedplot1 <- ggplot(summaryI, aes(x=group, y=cum_cases, fill=Cumulative_cases)) + geom_col(fill="#0099f9") + ggtitle("Cumulative Cases 2021 India") + scale_y_continuous(labels = scales::unit_format(unit = "M", scale = 1e-6))
mergedplot1
# Cumulative Deaths
summaryU2 <- mergedUS %>% group_by(group) %>% summarise(cum_deaths=mean(Cumulative_deaths))
mergedplot2 <- ggplot(summaryU2, aes(x=group, y=cum_deaths, fill=Cumulative_deaths)) + geom_col(fill="#0099f9") + ggtitle("Cumulative Deaths 2021 US") + scale_y_continuous(labels = scales::unit_format(unit = "k", scale = 1e-3))
mergedplot2
summaryI3 <- mergedInd %>% group_by(group) %>% summarise(cum_deaths=mean(Cumulative_deaths))
mergedplot3 <- ggplot(summaryI3, aes(x=group, y=cum_deaths, fill=Cumulative_deaths)) + geom_col(fill="#0099f9") + scale_y_continuous(labels = scales::unit_format(unit = "k", scale = 1e-3)) + ggtitle("Cumulative Deaths 2021 India")
mergedplot3
These are some bar plots for the cumulative deaths of each group of time for each country. Looking at each group of each country, we can observe the trend of increasing cumulative cases per group as different rates for each country. The fact that India’s cases rise much faster on a group by group basis may indicate potential difference in the spread of covid in different hemispheres. The static increases of cases in groups in general means that time is influencing covid spread at different rates, and that spread is resulting in subsequent covid deaths shown here by this graph. This shows that death is not as good as an indicator of covid spread for different reasons, perhaps mainly its lack of involvement in the overall spread of covid given its mere defintiion of patient critical conditions
# Case Mortality plots
summaryU4 <- mergedUS %>% group_by(group) %>% summarise(case_mort=mean(Cumulative_deaths/Cumulative_cases))
mergedplot4 <- ggplot(summaryU4, aes(x=group, y=case_mort, fill=case_mort)) + geom_col(fill="#0099f9") + ggtitle("Death Rate 2021 US")
mergedplot4
summaryI5 <- mergedInd %>% group_by(group) %>% summarise(case_mort=mean(Cumulative_deaths/Cumulative_cases))
mergedplot5 <- ggplot(summaryI5, aes(x=group, y=case_mort, fill=case_mort)) + geom_col(fill="#0099f9") + ggtitle("Death Rate 2021 India")
mergedplot5
These are some bar plots for the death rate of each group of time for each country. Looking at each group of each country, we can observe that the death rate tends to vary both country wise and group of time wise. the groups increase and decrease randomly as time progresses and country wise. This indicates that death rate is also affected by the variables we aim to analyze (and others) in addition to cumulative deaths.
##Models (Two Way and One Way ANOVA models)
We will now run our ANOVA models on mean cumulative cases by randomly sampling 6 groups from our 6 datasets split based on groups of time of year and country with 50 data points in each sample. We will first verify the assumptions of our ANOVA models to determine if they can validly analyze the data. We will run a one way ANOVA model with groups of time as our factor variables. After verifying our assumptions, we will run new model with country and determine if that corresponding two-way model is better than our initial one way ANOVA model with group of time as the factor. In order to proceed with our one way model, we must verify its assumptions first.
One Way ANOVA model: We will be using single factor ANOVA to answer our first question of interest. Our notation will be the standard notation used in single factor ANOVA, with F tests and p-values to carry our analysis. The factor we will employ is the time of the year denoted by the intervals of our datasets, and the groups we will analyze are the “Cumulative_cases” variable with three groups sampled from datasets that consist of COVID cases for the US in the year 2021. Single factors ANOVA will do this by comparing the means of the cumualtive cases of the group samples and determining wether there is a signficant difference between the group means of the three datasets.
Assumptions of Single Factor ANOVA (and verifications): 1Individual samples are taken from normally distributed population (verified with Shapiro-Wilks test that with a failed to reject null, verifying this assumption. In addition, n=50 which is greater than 30 per group)
Shapiro-Wilk normality test: Ho: Population is normally distributed Ha: Population is not normally distributed data: onewayres W = 0.98242, p-value = 0.05231 Fail to reject Ho
-Individual samples are independently drawn (observations are randomly sampled, verifying this assumption) - Variance is the same across different groups (Verified with residual and QQ plots and will be elaborated in sensitivity analysis, along with a square root transformation of the data) - Continuous dependent variable (cumulative COVID cases)
Let us proceed with our one way ANOVA model: Here is our null hypotheses:
H0: There is no difference between the means of cumulative covid cases of the time group samples HA: At least one group sample of time is different from one another alpha = .01 or
𝐻0:𝜇1=𝜇2=⋯=𝜇𝑟
against the alternative 𝐻1:not all 𝜇𝑖 are the same.
usaov <- aov(Cumulative_cases ~ group, data = rootmergedUS)
summary(usaov)
## Df Sum Sq Mean Sq F value Pr(>F)
## group 1 62854946 62854946 750.7 <2e-16 ***
## Residuals 148 12391414 83726
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Df Sum Sq Mean Sq F value Pr(>F)
group 1 60410547 60410547 872.8 <2e-16 *** Residuals 148 10243610 69214
— Signif. codes: 0 ‘’ 0.001 ‘’ 0.01 ‘’ 0.05 ‘.’ 0.1 ‘ ’ 1
With a p-value less than <.001, much below the alpha level, we can reject the hypothesis that there is no difference between the means of cumulative covid cases of the time group samples and conclude that at least one group sample of time is different from one another
We have a fruitful model that provides evidence for the impact of time on COVID spread. Now let us run a two-way ANOVA model that has both group of time and country as factors in the model. However to proceed with the model with any accuracy we must make sure the assumptions are verified.
Assumptions of Two Factor ANOVA (with verifications not satisfied!): 1. Individual samples are taken from normally distributed population (Not verifiable since the Shapiro-Wilks test conducted provides a p-value much lower than the signficance level, meaning that this model is invalid. We no longer look to this model for fruitful results but as a means for comparison for our valid one way ANOVA model elaborated further in the sensitivity analysis.)
Shapiro-Wilk normality test Ho: Population is normally distributed Ha: Population is not normally distributed data: twowayres W = 0.95615, p-value = 0.0001101 Reject Ho
-Individual samples are independently drawn (observations are randomly sampled, verifying this assumption) - Variance is the same across different groups (Not verified despite square root transformation being applied on the data) - Continuous dependent variable (cumulative COVID cases)
Since we cannot verify our two way model’s assumptions, we will use its calculations in a means to enhance our one way model in the sensitivity analysis section
# Two-way ANOVA
two.way <- aov(Cumulative_cases ~ group + Country, data = mergedcountry)
summary(two.way)
## Df Sum Sq Mean Sq F value Pr(>F)
## group 1 103325632 103325632 641.6 <2e-16 ***
## Country 1 44279296 44279296 274.9 <2e-16 ***
## Residuals 147 23674962 161054
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#5)
# Determine best model
AIC(usaov, two.way)
## df AIC
## usaov 3 2129.963
## two.way 4 2229.076
# Evaluate the normality of a Cumulative Cases using a Q-Q plot
plot(usaov)
plot(two.way)
onewayres<-resid(usaov)
shapiro.test(onewayres)
##
## Shapiro-Wilk normality test
##
## data: onewayres
## W = 0.97519, p-value = 0.008124
twowayres <- resid(two.way)
shapiro.test(twowayres)
##
## Shapiro-Wilk normality test
##
## data: twowayres
## W = 0.95414, p-value = 7.341e-05
Square Root Information:
In order to make sure our assumptions were met for our one way ANOVA model, we employed a simple square root transformation on the cumulative covid cases variable across group of time and country. We did this because our initial test of Shapiro-Wilks on the regular variable did not yield a conclusion that said the population was normally distributed. Upon applying the transformation, our p-value for the resulting Shapiro-Wilks test went up greatly, allowing us to conclude that the popultion that the dataset was drawn from was normally distributed. Thanks to this test and central limit theorem with our group samples being no less than 50, (the theorem states that the larger a datset is the more normally distributed it will be especially when the sample size is greater than 30), we were able to verify our assumptions for the One Way Model. However, even when we applied a square root transformation on the 2021 data for the two way model, our p-value for the Shapiro-Wilks test was extremely low, indicating that the population is not normally distrivuted for the population being studied of this two-way ANOVA model. We believe that this was better in general as the AIC for the one way ANOVA model was much lower than that of the two way ANOVA model, further porving the benefits of our valid One Way ANOVA model.
AIC information:
The AIC for our one way (usaov) model is 2101.411 with df = 3 while the two.way model has an AIC of 2229.520 with df=4. Since a lower AIC is better, this gives us more evidence that a usaov model will fare much better with the explanation of our target covid variable cumulative covid cases.
Residual/QQ plots:
For the plots of the one way ANOVA model specifically, we see that the fitted values are distributed via the group they are inm, and the group variances look realitvely equal for some groups, especialyl groups 1 and 3. group 2 could have a more equal variance, but overall it seems that our one way ANOVA model enjoys the verficaiton of equal group variance based on our observations of the residual vs fitted model graph and scale location graphs. The QQplot also suggests that the data approximately matches the theoretical quantiles with the exception of the edge points, indicating the variablity in the model but also does not bias the model too much. There does not seem to be any leverage points in the one way ANOVA model except for some that reach the edge of that definition from Cooks distance. However, the plots of each model in one way ANOVA seem to have their assumptions more satsified then the plots in the two way model. From the QQplot to the outliers, the data for the model in two way ANOVA tends to be more biased at least in comparison to the one way ANOVA model.
Our conclusions are simple yet robust. We will first begin by answering our questions of interest. We answer our first question by running summary statistics across the 6 datasets in order to study the 5 numbers of each group of time factor level and country level. We find that their tends to be larger differences of cumulative covid cases especially in number summaries across groups of time in each country but there are similarities of the covid cases variable changes when it is just country. We saw that this is not the case for covid deaths and death rate, as that varies across groups of time and country in different directions. We analyzed the distrbutions of each of the 6 datasets and found that usually a group or two will tend to have a couple of outliers, but nothing too drastic to skew the dataset heavily.
We answer our second question of interest on wether group of time as a factor has an impact of overall mean cumulative covid cases through ANOVA modeling. We found that the groups of time in the US of the year 2021 had at least one group mean that was differnt from the rest, indicating that the time of the year does have a signficant impact on covid case and overall covid spread. This means that our second question about the factor group of times and its effect on cumulative covid cases was answered, and we could combine that result with our other results answered in question 3 to determine its impact.
We answer our third question on wether the time factor and country factor both have an effect on mean cumulative covid case by finding that it is not possible. Partly due to the inability to verify the assumptions of a two-way model, we were unable to figure out a way in which time and country both significantly impact mean cumulative covid cases. This is further proven by the lower AIC shown by our one way ANOVA model compared to the run two way model. We answer this quesiton further by concluding that the time of year does indeed have an effect on mean cumulative covid cases, which means that actions can be taken given these results. We know that the mean cumulative covid cases variable will increase at different times, (perhaps higher in the winters and colder seasons), so we can efectively practice more quaranting skills during those times of high spread, while being more relaxed of quarantine during other seasons.
Our warnings and caveats to our approach include the our inability to verify our assumptions like the two way ANOVA model. We also performed a square root transfromation onto the data which naturally skews the reality of the data. In the future we can polish our model data so it can pass our model assumptions without the use of powerful transformations which, while useful, may skew the true dataset. We also could do more with the one way ANOVA model, specifically determining which group of times have higher cumulative covid cases compared to others, allowing more elaboration of results. We also could determine the parameters of our data better in the future. Overall, this report yieled fruitful results that provides viable conclusions about factors like time and how they impact covid in 2021 that can potentially help public health and safety for corona today.
Acknowledgements: Udirno Chaudhuri Adarsh Pantula